require(tidyverse)
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
sal_data = read.csv('C:/Users/prana/Desktop/IS-680-R/Project_Datasets/Levels_Fyi_Salary_Data.csv')
sal_data
names(sal_data)
## [1] "timestamp" "company"
## [3] "level" "title"
## [5] "totalyearlycompensation" "location"
## [7] "yearsofexperience" "yearsatcompany"
## [9] "tag" "basesalary"
## [11] "stockgrantvalue" "bonus"
## [13] "gender" "otherdetails"
## [15] "cityid" "dmaid"
## [17] "rowNumber" "Masters_Degree"
## [19] "Bachelors_Degree" "Doctorate_Degree"
## [21] "Highschool" "Some_College"
## [23] "Race_Asian" "Race_White"
## [25] "Race_Two_Or_More" "Race_Black"
## [27] "Race_Hispanic" "Race"
## [29] "Education"
ncol(sal_data)
## [1] 29
Checking for the number of columns
nrow(sal_data)
## [1] 62642
Checking for the number of rows
summary(sal_data)
## timestamp company level title
## Length:62642 Length:62642 Length:62642 Length:62642
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## totalyearlycompensation location yearsofexperience yearsatcompany
## Min. : 10000 Length:62642 Min. : 0.000 Min. : 0.000
## 1st Qu.: 135000 Class :character 1st Qu.: 3.000 1st Qu.: 0.000
## Median : 188000 Mode :character Median : 6.000 Median : 2.000
## Mean : 216300 Mean : 7.204 Mean : 2.702
## 3rd Qu.: 264000 3rd Qu.:10.000 3rd Qu.: 4.000
## Max. :4980000 Max. :69.000 Max. :69.000
##
## tag basesalary stockgrantvalue bonus
## Length:62642 Min. : 0 Min. : 0 Min. : 0
## Class :character 1st Qu.: 108000 1st Qu.: 0 1st Qu.: 1000
## Mode :character Median : 140000 Median : 25000 Median : 14000
## Mean : 136687 Mean : 51486 Mean : 19335
## 3rd Qu.: 170000 3rd Qu.: 65000 3rd Qu.: 26000
## Max. :1659870 Max. :2800000 Max. :1000000
##
## gender otherdetails cityid dmaid
## Length:62642 Length:62642 Min. : 0 Min. : 0.0
## Class :character Class :character 1st Qu.: 7369 1st Qu.:506.0
## Mode :character Mode :character Median : 7839 Median :807.0
## Mean : 9856 Mean :616.1
## 3rd Qu.:11521 3rd Qu.:807.0
## Max. :47926 Max. :881.0
## NA's :2
## rowNumber Masters_Degree Bachelors_Degree Doctorate_Degree
## Min. : 1 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:20069 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :42019 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :41695 Mean :0.2457 Mean :0.2012 Mean :0.02878
## 3rd Qu.:63022 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :83875 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## Highschool Some_College Race_Asian Race_White
## Min. :0.000000 Min. :0.000000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.000000 Median :0.000000 Median :0.0000 Median :0.0000
## Mean :0.005108 Mean :0.005667 Mean :0.1879 Mean :0.1282
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.000000 Max. :1.000000 Max. :1.0000 Max. :1.0000
##
## Race_Two_Or_More Race_Black Race_Hispanic Race
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Length:62642
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 Class :character
## Median :0.00000 Median :0.00000 Median :0.00000 Mode :character
## Mean :0.01283 Mean :0.01102 Mean :0.01804
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000
##
## Education
## Length:62642
## Class :character
## Mode :character
##
##
##
##
ggplot(sal_data, aes(totalyearlycompensation), fill = title) +
geom_histogram() +
facet_wrap(~title)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Looking a t the histogram we can say that Software Engineer earns more compared to all other titles.
ggplot(sal_data, aes(totalyearlycompensation)) +
geom_boxplot() +
coord_flip()
q1 <- quantile(sal_data$totalyearlycompensation, 0.25)
q3 <- quantile(sal_data$totalyearlycompensation, 0.75)
iqr <- IQR(sal_data$totalyearlycompensation)
q1
## 25%
## 135000
q3
## 75%
## 264000
iqr
## [1] 129000
no_outliers <- subset(sal_data, sal_data$totalyearlycompensation > (q1 - 1.5*iqr) & sal_data$totalyearlycompensation < (q3 + 1.5*iqr))
no_outliers
nrow(sal_data) - nrow(no_outliers)
## [1] 3133
ggplot(no_outliers, aes(totalyearlycompensation)) +
geom_boxplot() +
coord_flip()
Now as you can see in the above boxplot there are no outliers in the column.
tot_sal <- sal_data %>%
group_by(company) %>%
summarize(tot_salry = sum(totalyearlycompensation)) %>%
arrange(desc(tot_salry))
tot_sal
Here in the chunk above we got the companies that offer more compensation by sorting ‘totalyearlycompensation’ column.
tot_sal$company[1:10]
## [1] "Amazon" "Google" "Microsoft" "Facebook" "Apple"
## [6] "Salesforce" "Uber" "Oracle" "LinkedIn" "Cisco"
In the above two chunk the Top 10 companies who offer more compensation to their employees has been extracted.
lvl_exp <- sal_data %>%
select(level,yearsofexperience,location) %>%
mutate(most_exp = yearsofexperience > 40) %>%
group_by(level, location) %>%
arrange(desc(most_exp))
lvl_exp
Displaying the levels which have experience more than 40 years.
lvl_exp$level[1:6]
## [1] "L6" "L6" "SDE 5" "Consulting MTS"
## [5] "Band 9" "5"
Top 6 levels which have an experience of more than 40 years.
cmp_bns <- no_outliers %>%
select(company, bonus) %>%
group_by(company) %>%
summarise(total_bns = sum(bonus)) %>%
arrange(desc(total_bns))
cmp_bns
The above chunk elaborates about the companies which gave more bonus compared to other companies in descending order.
names(sal_data)
## [1] "timestamp" "company"
## [3] "level" "title"
## [5] "totalyearlycompensation" "location"
## [7] "yearsofexperience" "yearsatcompany"
## [9] "tag" "basesalary"
## [11] "stockgrantvalue" "bonus"
## [13] "gender" "otherdetails"
## [15] "cityid" "dmaid"
## [17] "rowNumber" "Masters_Degree"
## [19] "Bachelors_Degree" "Doctorate_Degree"
## [21] "Highschool" "Some_College"
## [23] "Race_Asian" "Race_White"
## [25] "Race_Two_Or_More" "Race_Black"
## [27] "Race_Hispanic" "Race"
## [29] "Education"
city_max <- no_outliers %>%
group_by(location) %>%
summarize(max_sal = max(totalyearlycompensation)) %>%
arrange(desc(max_sal))
city_max
From the above chunk you can extract the info. like in which city you are most likely to get paid more.
unique(sal_data[c("gender")])
sum(is.na(no_outliers$gender))
## [1] 18315
no_na <- na.omit(sal_data)
no_na_ot <- na.omit(no_outliers)
no_na
no_na_ot
ggplot(data = sal_data,
mapping = aes(x = totalyearlycompensation)) +
geom_histogram(alpha = 0.5, bins = 50) +
labs(x = "yearly_compensation", y = "number_of_employees",
title = "Compensation based on gender") +
facet_wrap(~gender)
ggplot(data = no_outliers,
mapping = aes(x = totalyearlycompensation)) +
geom_histogram(alpha = 0.5, bins = 50) +
labs(x = "yearly_compensation", y = "number_of_employees",
title = "Compensation based on gender") +
facet_wrap(~gender)
As you can see the difference in the above and below histogram with the same options. The output for the second histogram which is data without any outliers in it gave us the best distribution.
ggplot(data = no_na,
mapping = aes(x = totalyearlycompensation)) +
geom_histogram(alpha = 0.5, bins = 50) +
labs(x = "yearly_compensation", y = "number_of_employees",
title = "Compensation based on gender") +
facet_wrap(~gender)
ggplot(data = no_na_ot,
mapping = aes(x = totalyearlycompensation)) +
geom_histogram(alpha = 0.5, bins = 50) +
labs(x = "yearly_compensation", y = "number_of_employees",
title = "Compensation based on gender") +
facet_wrap(~gender)
ggplot(data = no_outliers,
mapping = aes(x = title, y = yearsofexperience)) +
geom_point(aes(colour = gender)) +
theme_bw() +
coord_flip()
ggplot(data = no_na_ot,
mapping = aes(x = title, y = yearsofexperience, fill = Masters_Degree)) +
geom_point(aes(colour = gender)) +
theme_bw() +
coord_flip()